Completed
Pull Request — master (#155)
by Christophe
02:21
created

Tokenizer::rcdata()   B

Complexity

Conditions 7
Paths 7

Size

Total Lines 32

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 21
CRAP Score 7.0957

Importance

Changes 0
Metric Value
dl 0
loc 32
ccs 21
cts 24
cp 0.875
rs 8.4746
c 0
b 0
f 0
cc 7
nc 7
nop 1
crap 7.0957
1
<?php
2
3
namespace Masterminds\HTML5\Parser;
4
5
use Masterminds\HTML5\Elements;
6
7
/**
8
 * The HTML5 tokenizer.
9
 *
10
 * The tokenizer's role is reading data from the scanner and gathering it into
11
 * semantic units. From the tokenizer, data is emitted to an event handler,
12
 * which may (for example) create a DOM tree.
13
 *
14
 * The HTML5 specification has a detailed explanation of tokenizing HTML5. We
15
 * follow that specification to the maximum extent that we can. If you find
16
 * a discrepancy that is not documented, please file a bug and/or submit a
17
 * patch.
18
 *
19
 * This tokenizer is implemented as a recursive descent parser.
20
 *
21
 * Within the API documentation, you may see references to the specific section
22
 * of the HTML5 spec that the code attempts to reproduce. Example: 8.2.4.1.
23
 * This refers to section 8.2.4.1 of the HTML5 CR specification.
24
 *
25
 * @see http://www.w3.org/TR/2012/CR-html5-20121217/
26
 */
27
class Tokenizer
28
{
29
    /** Conformance mode identifier for XML-style parsing. */
    const CONFORMANT_XML = 'xml';

    /** Conformance mode identifier for HTML-style parsing (the default). */
    const CONFORMANT_HTML = 'html';

    /** The characters this tokenizer treats as whitespace. */
    const WHITE = "\t\n\f ";

    /** Input source; every read goes through this scanner. */
    protected $scanner;

    /** Event handler that receives the tokens produced here. */
    protected $events;

    protected $tok;

    /**
     * Buffer for text.
     */
    protected $text = '';

    // The parse loop keeps running while this is true; eof() switches it off.
    protected $carryOn = true;

    // 0 means normal text mode; see setTextMode() for the other values.
    protected $textMode = 0;

    // Tag name that terminates RAW/RCDATA text mode, or null when unset.
    protected $untilTag = null;

    // One of the CONFORMANT_* constants.
    protected $mode = self::CONFORMANT_HTML;
51
52
    /**
     * Build a tokenizer around an input scanner and an output event handler.
     *
     * The usual flow is: construct the tokenizer with a scanner (input) and
     * an event handler (output), then call Tokenizer::parse().
     *
     * @param Scanner      $scanner      A scanner initialized with an input stream.
     * @param EventHandler $eventHandler An event handler, initialized and ready to receive events.
     * @param string       $mode         Conformance mode; defaults to self::CONFORMANT_HTML.
     */
    public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML)
    {
        $this->mode = $mode;
        $this->events = $eventHandler;
        $this->scanner = $scanner;
    }
69
70
    /**
     * Run the tokenizer over the whole input.
     *
     * consumeData() is invoked at least once and then repeatedly for as long
     * as the carry-on flag stays true. Tokens and errors are delivered to the
     * event handler as they are produced.
     */
    public function parse()
    {
        // FIXME: Add infinite loop protection.
        for (;;) {
            $this->consumeData();

            if (!$this->carryOn) {
                break;
            }
        }
    }
88
89
    /**
     * Select how character data is read from now on.
     *
     * Only the Elements::TEXT_RAW and Elements::TEXT_RCDATA bits of the given
     * value are kept; any other value collapses to 0, which is the normal
     * text mode.
     *
     * tagName() calls this with the integer returned by the event handler's
     * startTag(); it may also be called directly.
     *
     * @param int    $textmode One of Elements::TEXT_*.
     * @param string $untilTag The tag that should stop RAW or RCDATA mode. Normal mode does not
     *                         use this indicator.
     */
    public function setTextMode($textmode, $untilTag = null)
    {
        $specialModes = Elements::TEXT_RAW | Elements::TEXT_RCDATA;

        $this->untilTag = $untilTag;
        $this->textMode = $textmode & $specialModes;
    }
112
113
    /**
     * Consume a character and make a move.
     * HTML5 8.2.4.1.
     *
     * @return bool The carry-on flag: false once EOF has been handled.
     */
    protected function consumeData()
    {
        $current = $this->scanner->current();

        // Character reference: decode it straight into the text buffer.
        if ('&' === $current) {
            $this->buffer($this->decodeCharacterReference());
            $current = $this->scanner->current();
        }

        // Tag-like construct.
        if ('<' === $current) {
            // Any buffered text data can go out now.
            $this->flushBuffer();

            $afterLt = $this->scanner->next();

            // Try each markup parser in turn; the first one returning true wins.
            // If none matches, report the error and fall back to character data
            // (the original comment notes that parseError() always returns false).
            if (!$this->markupDeclaration($afterLt)
                && !$this->endTag()
                && !$this->processingInstruction()
                && !$this->tagName()
                && !$this->parseError('Illegal tag opening')
            ) {
                $this->characterData();
            }

            $current = $this->scanner->current();
        }

        // Handle end of document.
        $this->eof($current);

        if (false === $current) {
            return $this->carryOn;
        }

        // Plain character handling, depending on the active text mode.
        if (Elements::TEXT_RAW == $this->textMode) {
            $this->rawText($current);
        } elseif (Elements::TEXT_RCDATA == $this->textMode) {
            $this->rcdata($current);
        } elseif (!strspn($current, '<&')) {
            // NULL character
            if ("\00" === $current) {
                $this->parseError('Received null character.');
            }

            $this->text .= $current;
            $this->scanner->next();
        }

        return $this->carryOn;
    }
176
177
    /**
     * Dispatch the current character to the reader for the active text mode.
     *
     * Returns false at EOF. In normal mode it also returns false when the
     * current character is '<' or '&'; otherwise the character is buffered
     * through text().
     *
     * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
     *
     * @return bool
     */
    protected function characterData()
    {
        $char = $this->scanner->current();
        if (false === $char) {
            return false;
        }

        if (Elements::TEXT_RAW == $this->textMode) {
            return $this->rawText($char);
        }

        if (Elements::TEXT_RCDATA == $this->textMode) {
            return $this->rcdata($char);
        }

        return strspn($char, '<&') ? false : $this->text($char);
    }
203
204
    /**
     * Append the current token to the text buffer and advance the scanner.
     *
     * A NUL token is still buffered, but a parse error is reported first.
     *
     * @param string $tok The current token.
     *
     * @return bool True when the token was buffered, false when $tok is false.
     */
    protected function text($tok)
    {
        // A false token should never reach this method; nothing is buffered if it does.
        if (false !== $tok) {
            // NULL character
            if ("\00" === $tok) {
                $this->parseError('Received null character.');
            }

            $this->buffer($tok);
            $this->scanner->next();

            return true;
        }

        return false;
    }
228
229
    /**
     * Read text in RAW mode.
     *
     * Without a terminating tag name this simply delegates to text().
     * Otherwise everything up to "</untilTag>" is read, emitted as one text
     * event, the text mode is reset to 0 and the end tag is consumed.
     *
     * @param string $tok The current token.
     *
     * @return bool
     */
    protected function rawText($tok)
    {
        if (null === $this->untilTag) {
            return $this->text($tok);
        }

        $rawData = $this->readUntilSequence('</' . $this->untilTag . '>');
        $this->events->text($rawData);
        $this->setTextMode(0);

        return $this->endTag();
    }
249
250
    /**
     * Read text in RCDATA mode.
     *
     * Characters are collected (with character references decoded) until the
     * input ends or "</untilTag" is found at a '<'. The collected text is
     * emitted as one text event, the text mode is reset to 0 and the end tag
     * is consumed.
     *
     * @param string $tok The current token.
     *
     * @return bool
     */
    protected function rcdata($tok)
    {
        if (null === $this->untilTag) {
            return $this->text($tok);
        }

        $closing = '</' . $this->untilTag;
        // Names that are not HTML5 elements are matched case-sensitively.
        $caseSensitive = !Elements::isHtml5Element($this->untilTag);
        $collected = '';

        while (false !== $tok) {
            if ('<' == $tok && $this->scanner->sequenceMatches($closing, $caseSensitive)) {
                break;
            }

            if ('&' == $tok) {
                $collected .= $this->decodeCharacterReference();
                $tok = $this->scanner->current();
            } else {
                $collected .= $tok;
                $tok = $this->scanner->next();
            }
        }

        // Look past the closing sequence (and trailing whitespace) only to
        // verify that a '>' follows, then rewind so endTag() can consume it.
        $lookahead = strlen($closing);
        $this->scanner->consume($lookahead);
        $lookahead += $this->scanner->whitespace();
        if ('>' !== $this->scanner->current()) {
            $this->parseError('Unclosed RCDATA end tag');
        }
        $this->scanner->unconsume($lookahead);

        $this->events->text($collected);
        $this->setTextMode(0);

        return $this->endTag();
    }
289
290
    /**
     * Emit the EOF event when the given token marks the end of input.
     *
     * On EOF the text buffer is flushed, the handler's eof() is called and
     * the carry-on flag is cleared so that parse() stops.
     *
     * @param string|false $tok The current token; false means end of input.
     *
     * @return bool True when EOF was handled, false otherwise.
     */
    protected function eof($tok)
    {
        if (false !== $tok) {
            return false;
        }

        $this->flushBuffer();
        $this->events->eof();
        $this->carryOn = false;

        return true;
    }
306
307
    /**
     * Handle constructs that start with "<!": comments, DOCTYPE and CDATA.
     *
     * Anything else after "<!" is reported as a parse error and consumed as
     * a bogus comment.
     *
     * @param string $tok The character following '<'.
     *
     * @return bool False when $tok is not '!'; otherwise the result of the
     *              selected sub-parser (true for the bogus-comment fallback).
     */
    protected function markupDeclaration($tok)
    {
        if ('!' != $tok) {
            return false;
        }

        $next = $this->scanner->next();

        // Comment: "<!--"
        if ('-' == $next && '-' == $this->scanner->peek()) {
            $this->scanner->next(); // Consume the other '-'
            $this->scanner->next(); // Next char.

            return $this->comment();
        }

        // Doctype
        if ('D' == $next || 'd' == $next) {
            return $this->doctype();
        }

        // CDATA section
        if ('[' == $next) {
            return $this->cdataSection();
        }

        $this->parseError('Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s', $next);
        $this->bogusComment('<!');

        return true;
    }
335
336
    /**
     * Consume an end tag. See section 8.2.4.9.
     *
     * Expects the scanner to sit on the '/' that follows '<'. The tag name is
     * lowercased unless the tokenizer is in XML mode, and an endTag event is
     * emitted.
     *
     * @return bool False when the current character is not '/', or when the
     *              character after '/' is NUL or EOF; true otherwise
     *              (including the bogus-comment path).
     */
    protected function endTag()
    {
        if ('/' != $this->scanner->current()) {
            return false;
        }
        $tok = $this->scanner->next();

        // a-zA-Z -> tagname
        // > -> parse error
        // EOF -> parse error
        // -> parse error
        //
        // At EOF $tok is false; ctype_alpha() must only be given strings
        // (non-string arguments are deprecated as of PHP 8.1), so guard first.
        if (!is_string($tok) || !ctype_alpha($tok)) {
            $this->parseError("Expected tag name, got '%s'", $tok);
            if ("\0" == $tok || false === $tok) {
                return false;
            }

            return $this->bogusComment('</');
        }

        $name = $this->scanner->charsUntil("\n\f \t>");
        $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name);
        // Trash whitespace.
        $this->scanner->whitespace();

        $tok = $this->scanner->current();
        if ('>' != $tok) {
            $this->parseError("Expected >, got '%s'", $tok);
            // We just trash stuff until we get to the next tag close.
            $this->scanner->charsUntil('>');
        }

        $this->events->endTag($name);
        $this->scanner->next();

        return true;
    }
376
377
    /**
     * Consume a tag name and body. See section 8.2.4.10.
     *
     * Reads the tag name and its attributes, emits a startTag event and, when
     * the handler returns an integer, switches the text mode accordingly with
     * this tag as the terminating tag.
     *
     * @return bool False when the current character cannot start a tag name
     *              (including EOF); true once a start tag has been emitted.
     */
    protected function tagName()
    {
        $tok = $this->scanner->current();
        // At EOF $tok is false; ctype_alpha() must only be given strings
        // (non-string arguments are deprecated as of PHP 8.1), so guard first.
        if (!is_string($tok) || !ctype_alpha($tok)) {
            return false;
        }

        // We know this is at least one char.
        $name = $this->scanner->charsWhile(':_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz');
        $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name);
        $attributes = array();
        $selfClose = false;

        // Handle attribute parse exceptions here so that we can
        // react by trying to build a sensible parse tree.
        try {
            do {
                $this->scanner->whitespace();
                $this->attribute($attributes);
            } while (!$this->isTagEnd($selfClose));
        } catch (ParseError $e) {
            $selfClose = false;
        }

        $mode = $this->events->startTag($name, $attributes, $selfClose);

        // An integer return value is a request to change the text mode.
        if (is_int($mode)) {
            $this->setTextMode($mode, $name);
        }

        $this->scanner->next();

        return true;
    }
414
415
    /**
     * Check if the scanner has reached the end of a tag.
     *
     * @param bool $selfClose Set to true (by reference) when the tag ends with "/>".
     *
     * @return bool True at '>', at "/>" or at EOF (EOF also reports a parse
     *              error); false otherwise.
     */
    protected function isTagEnd(&$selfClose)
    {
        $tok = $this->scanner->current();

        if ('/' != $tok) {
            if ('>' == $tok) {
                return true;
            }

            if (false === $tok) {
                $this->parseError('Unexpected EOF inside of tag.');

                return true;
            }

            return false;
        }

        // A slash: skip it and any whitespace, then see what follows.
        $this->scanner->next();
        $this->scanner->whitespace();
        $tok = $this->scanner->current();

        if ('>' == $tok) {
            $selfClose = true;

            return true;
        }

        if (false === $tok) {
            $this->parseError('Unexpected EOF inside of tag.');

            return true;
        }

        // Basically, we skip the / token and go on.
        // See 8.2.4.43.
        $this->parseError("Unexpected '%s' inside of a tag.", $tok);

        return false;
    }
454
455
    /**
     * Parse one attribute from inside of a tag.
     *
     * The attribute name is lowercased. Names containing characters that
     * DOMElement::setAttribute rejects are reported as parse errors and are
     * not stored, although their value is still consumed.
     * See https://github.com/Masterminds/html5-php/issues/23 and
     * http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
     *
     * @param string[] $attributes Receives the parsed name => value pair.
     *
     * @return bool False when positioned at '/', '>' or EOF; true after an
     *              attribute has been consumed.
     *
     * @throws ParseError When a '<' is found where an attribute should start.
     */
    protected function attribute(&$attributes)
    {
        $tok = $this->scanner->current();
        if (false === $tok || '>' == $tok || '/' == $tok) {
            return false;
        }

        if ('<' == $tok) {
            $this->parseError("Unexpected '<' inside of attributes list.");
            // Push the < back onto the stack.
            $this->scanner->unconsume();
            // Let the caller figure out how to handle this.
            throw new ParseError('Start tag inside of attribute.');
        }

        $name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));

        if ('' === $name) {
            $tok = $this->scanner->current();
            $this->parseError('Expected an attribute name, got %s.', $tok);
            // Really, only '=' can be the char here. Everything else gets absorbed
            // under one rule or another.
            $name = $tok;
            $this->scanner->next();
        }

        // Null means the name is acceptable; otherwise it holds the error format.
        $problem = null;
        if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) {
            // Characters anywhere in the name that setAttribute would reject.
            $problem = 'Unexpected characters in attribute name: %s';
        } elseif (preg_match('/^[0-9.-]/u', $name)) {
            // HTML5 does not restrict the first character, but setAttribute does.
            $problem = 'Unexpected character at the begining of attribute name: %s';
        }

        if (null !== $problem) {
            $this->parseError($problem, $name);
        }

        // 8.1.2.3
        $this->scanner->whitespace();

        $val = $this->attributeValue();
        if (null === $problem) {
            $attributes[$name] = $val;
        }

        return true;
    }
518
519
    /**
     * Consume an attribute value. See section 8.2.4.37 and after.
     *
     * @return string|null Null when there is no '=' or when the value is
     *                     missing; otherwise the quoted or unquoted value.
     */
    protected function attributeValue()
    {
        if ('=' != $this->scanner->current()) {
            return null;
        }
        $this->scanner->next();
        // 8.1.2.3
        $this->scanner->whitespace();

        $tok = $this->scanner->current();

        // Whitespace here indicates an empty value.
        if (in_array($tok, array("\n", "\f", ' ', "\t"), true)) {
            return null;
        }

        if ('"' === $tok || "'" === $tok) {
            $this->scanner->next();

            return $this->quotedAttributeValue($tok);
        }

        // '/' is deliberately not treated like '>':
        // 8.2.4.37 seems to allow foo=/ as a valid attr.
        if ('>' === $tok) {
            $this->parseError('Expected attribute value, got tag end.');

            return null;
        }

        if ('=' === $tok || '`' === $tok) {
            $this->parseError('Expecting quotes, got %s.', $tok);
        }

        // Everything else, including EOF ($tok === false), is read as an
        // unquoted value.
        return $this->unquotedAttributeValue();
    }
560
561
    /**
     * Get an attribute value string.
     *
     * Reads up to the closing quote, decoding character references on the
     * way, then steps over the terminating character.
     *
     * @param string $quote IMPORTANT: This is a series of chars! Any one of which will be considered
     *                      termination of an attribute's value. E.g. "\"'" will stop at either
     *                      ' or ".
     *
     * @return string The attribute value.
     */
    protected function quotedAttributeValue($quote)
    {
        // A form feed or any quote char ends the value; '&' only pauses the
        // read so the reference can be decoded.
        $stops = "\f" . $quote . '&';
        $value = '';

        for (;;) {
            $chunk = $this->scanner->charsUntil($stops);
            if (false === $chunk) {
                break;
            }
            $value .= $chunk;

            if ('&' != $this->scanner->current()) {
                break;
            }
            $value .= $this->decodeCharacterReference(true);
        }

        $this->scanner->next();

        return $value;
    }
594
595 1
    /**
     * Read an attribute value that is not wrapped in quotes.
     *
     * Reading stops at whitespace, '>' or EOF. Character references are
     * decoded; the characters ' " < = ` are kept but reported as parse errors.
     *
     * @return string The attribute value (possibly empty).
     */
    protected function unquotedAttributeValue()
    {
        $terminators = array("\n", "\f", ' ', "\t", '>');
        $suspicious = array("'", '"', '<', '=', '`');

        $value = '';
        $tok = $this->scanner->current();

        while (false !== $tok) {
            if (in_array($tok, $terminators, true)) {
                break;
            }

            if (in_array($tok, $suspicious, true)) {
                $this->parseError('Unexpected chars in unquoted attribute value %s', $tok);
                $value .= $tok;
                $tok = $this->scanner->next();
                continue;
            }

            if ('&' === $tok) {
                $value .= $this->decodeCharacterReference(true);
            } else {
                $value .= $this->scanner->charsUntil("\t\n\f >&\"'<=`");
            }

            $tok = $this->scanner->current();
        }

        return $value;
    }
633
634
    /**
     * Consume malformed markup as if it were a comment.
     * 8.2.4.44.
     *
     * The spec requires that the ENTIRE tag-like thing be enclosed inside of
     * the comment. So this will generate comments like:
     *
     * <!--</+foo>-->
     *
     * @param string $leading Prepend any leading characters. This essentially
     *                        negates the need to backtrack, but it's sort of a hack.
     *
     * @return bool Always true.
     */
    protected function bogusComment($leading = '')
    {
        // Everything up to the next '>' belongs to the comment ...
        $upToClose = $this->scanner->charsUntil('>');
        $body = $leading . (false !== $upToClose ? $upToClose : '');

        // ... and so does the character the scanner stopped on, unless it is EOF.
        $stoppedOn = $this->scanner->current();
        if (false !== $stoppedOn) {
            $body .= $stoppedOn;
        }

        $this->flushBuffer();
        $this->events->comment($body);
        $this->scanner->next();

        return true;
    }
666
667
    /**
     * Read a comment.
     * Expects the first tok to be inside of the comment.
     *
     * @return bool Always true.
     */
    protected function comment()
    {
        $tok = $this->scanner->current();

        // <!-->. Emit an empty comment because 8.2.4.46 says to.
        if ('>' == $tok) {
            // Parse error. Emit the comment token.
            $this->parseError("Expected comment data, got '>'");
            $this->events->comment('');
            $this->scanner->next();

            return true;
        }

        // Replace NULL with the replacement char (checked for the first character only).
        if ("\0" == $tok) {
            $tok = UTF8Utils::FFFD;
        }

        // Collect characters until isCommentEnd() reports the end of the comment or EOF.
        for ($body = ''; !$this->isCommentEnd(); $tok = $this->scanner->next()) {
            $body .= $tok;
        }

        $this->events->comment($body);
        $this->scanner->next();

        return true;
    }
702
703
    /**
     * Check if the scanner has reached the end of a comment.
     *
     * EOF counts as the end (with a parse error). When the comment end is
     * found, the scanner is advanced past both dashes onto the final '>'.
     *
     * @return bool
     */
    protected function isCommentEnd()
    {
        $current = $this->scanner->current();

        // EOF: hit the end.
        if (false === $current) {
            $this->parseError('Unexpected EOF in a comment.');

            return true;
        }

        // Only a '-' can start the end-of-comment marker.
        if ('-' == $current) {
            // Advance one, and test for '->'
            $second = $this->scanner->next();
            if ('-' == $second && '>' == $this->scanner->peek()) {
                $this->scanner->next(); // Consume the last '>'

                return true;
            }

            // Unread '-';
            $this->scanner->unconsume(1);
        }

        return false;
    }
735
736
    /**
     * Parse a DOCTYPE.
     *
     * Parse a DOCTYPE declaration. This method has strong bearing on whether or
     * not Quirksmode is enabled on the event handler.
     *
     * @todo This method is a little long. Should probably refactor.
     *
     * @return bool False when the scanner is not on a 'D'/'d', or when the
     *              PUBLIC/SYSTEM identifier is not a quoted string; true in
     *              every other non-EOF case. At EOF right after the keyword
     *              the result of eof() is returned.
     */
    protected function doctype()
    {
        if (strcasecmp($this->scanner->current(), 'D')) {
            return false;
        }
        // Check that string is DOCTYPE.
        $chars = $this->scanner->charsWhile('DOCTYPEdoctype');
        if (strcasecmp($chars, 'DOCTYPE')) {
            $this->parseError('Expected DOCTYPE, got %s', $chars);

            return $this->bogusComment('<!' . $chars);
        }

        $this->scanner->whitespace();
        $tok = $this->scanner->current();

        // EOF: die.
        if (false === $tok) {
            $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true);

            return $this->eof($tok);
        }

        // NULL char: convert.
        if ("\0" === $tok) {
            $this->parseError('Unexpected null character in DOCTYPE.');
        }

        $stop = " \n\f>";
        $doctypeName = $this->scanner->charsUntil($stop);
        // Lowercase ASCII, replace \0 with FFFD.
        // str_replace() substitutes the whole replacement string. The
        // three-argument form of strtr() maps byte-for-byte and would insert
        // only the first byte of a multi-byte replacement (UTF8Utils::FFFD is
        // presumably the UTF-8 encoding of U+FFFD -- confirm). The cast covers
        // a possible false return from charsUntil().
        $doctypeName = strtolower(str_replace("\0", UTF8Utils::FFFD, (string) $doctypeName));

        $tok = $this->scanner->current();

        // If false, emit a parse error, DOCTYPE, and return.
        if (false === $tok) {
            $this->parseError('Unexpected EOF in DOCTYPE declaration.');
            $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true);

            return true;
        }

        // Short DOCTYPE, like <!DOCTYPE html>
        if ('>' == $tok) {
            // DOCTYPE without a name.
            if (0 == strlen($doctypeName)) {
                $this->parseError('Expected a DOCTYPE name. Got nothing.');
                $this->events->doctype($doctypeName, 0, null, true);
                $this->scanner->next();

                return true;
            }
            $this->events->doctype($doctypeName);
            $this->scanner->next();

            return true;
        }
        $this->scanner->whitespace();

        $pub = strtoupper($this->scanner->getAsciiAlpha());
        $white = $this->scanner->whitespace();

        // Get ID, and flag it as pub or system.
        if (('PUBLIC' == $pub || 'SYSTEM' == $pub) && $white > 0) {
            // Get the sys ID.
            $type = 'PUBLIC' == $pub ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM;
            $id = $this->quotedString("\0>");
            if (false === $id) {
                // NOTE(review): the keyword ($pub) is passed where the other
                // calls pass the identifier, and false is returned -- confirm
                // this is intended.
                $this->events->doctype($doctypeName, $type, $pub, false);

                return false;
            }

            // Premature EOF.
            if (false === $this->scanner->current()) {
                $this->parseError('Unexpected EOF in DOCTYPE');
                $this->events->doctype($doctypeName, $type, $id, true);

                return true;
            }

            // Well-formed complete DOCTYPE.
            $this->scanner->whitespace();
            if ('>' == $this->scanner->current()) {
                $this->events->doctype($doctypeName, $type, $id, false);
                $this->scanner->next();

                return true;
            }

            // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK
            // Throw away the junk, parse error, quirks mode, return true.
            $this->scanner->charsUntil('>');
            $this->parseError('Malformed DOCTYPE.');
            $this->events->doctype($doctypeName, $type, $id, true);
            $this->scanner->next();

            return true;
        }

        // Else it's a bogus DOCTYPE.
        // Consume to > and trash.
        $this->scanner->charsUntil('>');

        $this->parseError('Expected PUBLIC or SYSTEM. Got %s.', $pub);
        $this->events->doctype($doctypeName, 0, null, true);
        $this->scanner->next();

        return true;
    }
857
858
    /**
     * Utility for reading a quoted string.
     *
     * The scanner must sit on the opening quote (' or "). Reading stops at
     * the matching quote or at any of the extra stop characters; a missing
     * closing quote is reported as a parse error.
     *
     * @param string $stopchars Characters (in addition to a close-quote) that should stop the string.
     *                          E.g. sometimes '>' is higher precedence than '"' or "'".
     *
     * @return mixed String if one is found (quotations omitted); false when
     *               the current character is not a quote.
     */
    protected function quotedString($stopchars)
    {
        $quote = $this->scanner->current();
        if ('"' != $quote && "'" != $quote) {
            return false;
        }

        $this->scanner->next();
        $contents = $this->scanner->charsUntil($quote . $stopchars);

        if ($this->scanner->current() == $quote) {
            // Step over the closing quote.
            $this->scanner->next();
        } else {
            // Parse error because no close quote.
            $this->parseError('Expected %s, got %s', $quote, $this->scanner->current());
        }

        return $contents;
    }
884
885
    /**
     * Handle a CDATA section.
     *
     * Expects the scanner on the '[' that follows "<!". Everything between
     * "<![CDATA[" and the first "]]>" is emitted as a cdata event; an empty
     * section produces an empty string. A malformed opener or EOF before
     * "]]>" is reported as a parse error and handed to bogusComment().
     *
     * @return bool False when the current character is not '['; true otherwise.
     */
    protected function cdataSection()
    {
        if ('[' != $this->scanner->current()) {
            return false;
        }
        $cdata = '';
        $this->scanner->next();

        $chars = $this->scanner->charsWhile('CDAT');
        if ('CDATA' != $chars || '[' != $this->scanner->current()) {
            $this->parseError('Expected [CDATA[, got %s', $chars);

            return $this->bogusComment('<![' . $chars);
        }

        $tok = $this->scanner->next();

        // Test for the terminator BEFORE consuming a character, so that an
        // empty section ("<![CDATA[]]>") does not swallow its own "]]>".
        while (!$this->scanner->sequenceMatches(']]>')) {
            if (false === $tok) {
                $this->parseError('Unexpected EOF inside CDATA.');
                $this->bogusComment('<![CDATA[' . $cdata);

                return true;
            }
            $cdata .= $tok;
            $tok = $this->scanner->next();
        }

        // Consume ]]>
        $this->scanner->consume(3);

        $this->events->cdata($cdata);

        return true;
    }
924
925
    // ================================================================
926
    // Non-HTML5
927
    // ================================================================
928
929
    /**
     * Handle a processing instruction.
     *
     * XML processing instructions are supposed to be ignored in HTML5,
     * treated as "bogus comments". However, since we're not a user
     * agent, we allow them. We consume until ?> and then issue a
     * processingInstruction() event on the event handler.
     *
     * @return bool False when the current character is not '?'; true otherwise
     *              (including the bogus-comment and EOF paths).
     */
    protected function processingInstruction()
    {
        if ('?' != $this->scanner->current()) {
            return false;
        }

        $tok = $this->scanner->next();
        // Cast: make sure a string is handed to strlen() and to the event
        // handler even if getAsciiAlpha() returns false.
        $procName = (string) $this->scanner->getAsciiAlpha();
        $white = $this->scanner->whitespace();

        // If not a PI, send to bogusComment.
        // The EOF test must be strict: with a loose comparison the character
        // '0' also equals false, so "<?php 0 ?>" was treated as a bogus comment.
        if (0 == strlen($procName) || 0 == $white || false === $this->scanner->current()) {
            $this->parseError("Expected processing instruction name, got $tok");
            // NOTE(review): if getAsciiAlpha() starts reading at $tok, $tok is
            // included twice in this prefix -- confirm against the scanner.
            $this->bogusComment('<?' . $tok . $procName);

            return true;
        }

        $data = '';
        // As long as it's not the case that the next two chars are ? and >.
        while (!('?' == $this->scanner->current() && '>' == $this->scanner->peek())) {
            $data .= $this->scanner->current();

            $tok = $this->scanner->next();
            if (false === $tok) {
                $this->parseError('Unexpected EOF in processing instruction.');
                $this->events->processingInstruction($procName, $data);

                return true;
            }
        }

        $this->scanner->next(); // >
        $this->scanner->next(); // Next token.
        $this->events->processingInstruction($procName, $data);

        return true;
    }
977
978
    // ================================================================
979
    // UTILITY FUNCTIONS
980
    // ================================================================
981
982
    /**
983
     * Read from the input stream until we get to the desired sequence
984
     * or hit the end of the input stream.
985
     *
986
     * @param string $sequence
987
     *
988
     * @return string
989
     */
990 8
    protected function readUntilSequence($sequence)
    {
        $collected = '';

        // Skipping ahead to the sequence's first character lets us read
        // larger blocks at once instead of testing every position.
        $lead = substr($sequence, 0, 1);

        while (false !== $this->scanner->current()) {
            $collected .= $this->scanner->charsUntil($lead);

            if (!$this->scanner->sequenceMatches($sequence, false)) {
                // Not the stop sequence after all: keep this character
                // as text and carry on from the next one.
                $collected .= $this->scanner->current();
                $this->scanner->next();

                continue;
            }

            // The stop sequence starts here; hand back what precedes it.
            return $collected;
        }

        // Falling out of the loop means the input ran out first.
        $this->parseError('Unexpected EOF during text read.');

        return $collected;
    }
1012
1013
    /**
1014
     * Check if upcoming chars match the given sequence.
1015
     *
1016
     * This will read the stream for the $sequence. If it's
1017
     * found, this will return true. If not, return false.
1018
     * Since this unconsumes any chars it reads, the caller
1019
     * will still need to read the next sequence, even if
1020
     * this returns true.
1021
     *
1022
     * Example: $this->scanner->sequenceMatches('</script>') will
1023
     * see if the input stream is at the start of a
1024
     * '</script>' string.
1025
     *
1026
     * @param string $sequence
1027
     * @param bool   $caseSensitive
1028
     *
1029
     * @return bool
1030
     */
1031
    protected function sequenceMatches($sequence, $caseSensitive = true)
    {
        // Raise the deprecation notice (silenced with @), then delegate
        // to the scanner's method of the same name.
        $notice = __METHOD__ . ' method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.';
        @trigger_error($notice, E_USER_DEPRECATED);

        $matched = $this->scanner->sequenceMatches($sequence, $caseSensitive);

        return $matched;
    }
1037
1038
    /**
1039
     * Send a TEXT event with the contents of the text buffer.
1040
     *
1041
     * This emits an EventHandler::text() event with the current contents of the
1042
     * temporary text buffer. (The buffer is used to group as much PCDATA
1043
     * as we can instead of emitting lots and lots of TEXT events.)
1044
     */
1045 127
    protected function flushBuffer()
    {
        // Nothing is emitted for an empty buffer.
        if ('' !== $this->text) {
            $pending = $this->text;
            $this->events->text($pending);

            // Start the next run of text from a clean buffer.
            $this->text = '';
        }
    }
1053
1054
    /**
1055
     * Add text to the temporary buffer.
1056
     *
1057
     * @see flushBuffer()
1058
     *
1059
     * @param string $str
1060
     */
1061 9
    protected function buffer($str)
    {
        // Append to the pending text; flushBuffer() emits and clears it.
        $this->text = $this->text . $str;
    }
1065
1066
    /**
1067
     * Emit a parse error.
1068
     *
1069
     * A parse error always returns false because it never consumes any
1070
     * characters.
1071
     *
1072
     * @param string $msg
1073
     *
1074
     * @return bool
1075
     */
1076 15
    protected function parseError($msg)
    {
        // Arguments after the message are substituted into it
        // printf-style; a lone message is passed through untouched.
        if (func_num_args() > 1) {
            $msg = vsprintf($msg, array_slice(func_get_args(), 1));
        }

        // Report the error together with the scanner's current line and column.
        $this->events->parseError(
            $msg,
            $this->scanner->currentLine(),
            $this->scanner->columnOffset()
        );

        return false;
    }
1091
1092
    /**
1093
     * Decode a character reference and return the string.
1094
     *
1095
     * If $inAttribute is set to true, a bare & will be returned as-is.
1096
     *
1097
     * @param bool $inAttribute Set to true if the text is inside of an attribute value.
1098
     *                          false otherwise.
1099
     *
1100
     * @return string
1101
     */
1102 12
    protected function decodeCharacterReference($inAttribute = false)
    {
        // Next char after &.
        $tok = $this->scanner->next();
        // Remember where the reference body starts so a failed match can
        // rewind the scanner to this point.
        $start = $this->scanner->position();

        // End of input right after the &: it is just a literal ampersand.
        if (false === $tok) {
            return '&';
        }

        // These indicate not an entity. We return just
        // the &.
        if (1 === strspn($tok, static::WHITE . '&<')) {
            return '&';
        }

        // Numeric entity
        if ('#' === $tok) {
            $tok = $this->scanner->next();

            // Hexadecimal encoding.
            // X[0-9a-fA-F]+;
            // x[0-9a-fA-F]+;
            if ('x' === $tok || 'X' === $tok) {
                $tok = $this->scanner->next(); // Consume x

                // Convert from hex code to char.
                $hex = $this->scanner->getHex();

                // "No hex digits" must not be confused with the single digit
                // "0": empty('0') is true in PHP, which used to send &#x0
                // down the error path below even though a digit had been read.
                if (empty($hex) && '0' !== $hex) {
                    $this->parseError('Expected &#xHEX;, got &#x%s', $tok);
                    // We unconsume because we don't know what parser rules might
                    // be in effect for the remaining chars. For example. '&#>'
                    // might result in a specific parsing rule inside of tag
                    // contexts, while not inside of pcdata context.
                    $this->scanner->unconsume(2);

                    return '&';
                }
                $entity = CharacterReference::lookupHex($hex);
            } else {
                // Decimal encoding.
                // [0-9]+;

                // Convert from decimal to char.
                $numeric = $this->scanner->getNumeric();
                if (false === $numeric) {
                    $this->parseError('Expected &#DIGITS;, got &#%s', $tok);
                    $this->scanner->unconsume(2);

                    return '&';
                }
                $entity = CharacterReference::lookupDecimal($numeric);
            }
        } elseif ('=' === $tok && $inAttribute) {
            // Inside an attribute value, "&=" is a bare ampersand.
            return '&';
        } else { // String entity.
            // Attempt to consume a string up to a ';'.
            // [a-zA-Z0-9]+;
            $cname = $this->scanner->getAsciiAlphaNum();
            $entity = CharacterReference::lookupName($cname);

            // When no entity is found provide the name of the unmatched string
            // and continue on as the & is not part of an entity. The & will
            // be converted to &amp; elsewhere.
            if (null === $entity) {
                // Inside attributes an unknown, non-empty name is not reported.
                if (!$inAttribute || '' === $cname) {
                    $this->parseError("No match in entity table for '%s'", $cname);
                }
                // Rewind to where the name started.
                $this->scanner->unconsume($this->scanner->position() - $start);

                return '&';
            }
        }

        // The scanner has advanced the cursor for us.
        $tok = $this->scanner->current();

        // We have an entity. We're done here.
        if (';' === $tok) {
            $this->scanner->next();

            return $entity;
        }

        // If in an attribute, then failing to match ; means unconsume the
        // entire string. Otherwise, failure to match is an error.
        if ($inAttribute) {
            $this->scanner->unconsume($this->scanner->position() - $start);

            return '&';
        }

        $this->parseError('Expected &ENTITY;, got &ENTITY%s (no trailing ;) ', $tok);

        return '&' . $entity;
    }
1198
}
1199